import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import random
import cufflinks as cf
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import math
import seaborn as sns
import chart_studio.plotly as pt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from pmdarima import auto_arima
from scipy.stats import boxcox,boxcox_normplot
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,mean_squared_log_error,median_absolute_error
from plotly.offline import iplot,plot,init_notebook_mode,download_plotlyjs
init_notebook_mode()
cf.go_offline()
%matplotlib inline
# Weather/malaria feature table, indexed by its parsed 'DATE(IST)' column.
finalcsv = pd.read_csv('FinalMalaria.csv', index_col='DATE(IST)', parse_dates=['DATE(IST)'])

# Case-count table, relabelled below as a 'Year' column, one column per year
# from 2005 to 2019, and a trailing 'Sum' column.
malaria_cases = pd.read_csv('2005-2019_Malaria_Cases.csv')
malaria_cases.columns = ['Year'] + [str(year) for year in range(2005, 2020)] + ['Sum']
malaria_cases.index = malaria_cases['Year']
malaria_cases.drop(['Year', 'Sum'], inplace=True, axis=1)

# Flatten the table column by column (every row of 2005, then every row of
# 2006, ...) into a single sequence.
# NOTE(review): presumably each row is a calendar month, which would make this
# a chronological monthly series -- confirm against the CSV.
malaria_cases_list = [
    malaria_cases.iloc[row, col]
    for col in range(malaria_cases.shape[1])
    for row in range(malaria_cases.shape[0])
]
malaria_cases

# 180 month-end timestamps starting in January 2005; reused further down as
# the index of print_DF.
date = pd.date_range('01/01/2005', freq='M', periods=180)
fig = px.line(y=malaria_cases_list)
fig.show()
# Hold out the tail of the series. int(len * 0.999) leaves exactly one
# observation in the test split for any series shorter than 1,000 points.
one_size = int(len(malaria_cases_list) * 0.999)
one_train = malaria_cases_list[:one_size]
one_test = malaria_cases_list[one_size:]

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

# Split the series into trend, seasonal and residual parts using a 12-step
# seasonal period, then draw the three parts together with the raw series.
decomposition = seasonal_decompose(malaria_cases_list, period=12)
fig = make_subplots(specs=[[{'secondary_y': True}]])
for series_values, series_label in (
    (decomposition.trend, 'Trend'),
    (decomposition.seasonal, 'Seasonal'),
    (decomposition.resid, 'Noise'),
    (malaria_cases_list, 'Malaria cases'),
):
    fig.add_trace(go.Scatter(y=series_values, name=series_label, mode='lines'))
fig.update_layout(title_text='Trend, Seasonality and Noise with Number of Malaria Cases registerd')
from statsmodels.tsa.stattools import acf,pacf
# Compute the autocorrelation (ACF) of the series, which helps us choose the order of the moving-average (MA) term.
from pandas.plotting import autocorrelation_plot
# Open a 10x8 inch matplotlib figure and draw the autocorrelation plot of the
# 'Malaria_cases' column into it; the return value of the plotting call is
# kept in auto_corr.
plt.figure(figsize=(10,8))
auto_corr = autocorrelation_plot(finalcsv['Malaria_cases'])
# Next, compute the partial autocorrelation function (PACF) to find a suitable order for the autoregressive (AR) term.
# Partial autocorrelation of the case series for lags 0..pacf_lags.
# The chart title is derived from the lag count so the two cannot drift apart
# (the title previously claimed 12 months while 36 lags were computed).
pacf_lags = 36
partial_corr = pacf(malaria_cases_list, nlags=pacf_lags)
fig = px.bar(
    y=partial_corr,
    title='Partial auto-correlation function of malaria cases showing last {0} months'.format(pacf_lags),
)
fig.update_xaxes(title_text='Months Prior')
fig.show()
# Now we test whether the data is stationary, using the Dickey–Fuller test.
# Project-local helper from the adFuller module.
# NOTE(review): presumably runs an augmented Dickey-Fuller test and reports
# the outcome -- confirm in adFuller.py.
from adFuller import stationarity_or_not_ad_fuller
# Run the check on the raw 'Malaria_cases' column.
stationarity_or_not_ad_fuller(finalcsv['Malaria_cases'])
# Since the p-value is greater than 0.05, we cannot reject the null hypothesis of a unit root, so the data is not stationary. That is why we cannot apply ARIMA-family models to the raw, undifferenced series.
# Train on everything except the tail. int(len * 0.9999) leaves exactly one
# observation in the test split for any series shorter than 10,000 points.
single_size = int(len(malaria_cases_list) * 0.9999)
single_train = malaria_cases_list[:single_size]
single_test = malaria_cases_list[single_size:]

# SARIMA(1,1,0)x(2,0,0,12) with a constant trend; the stationarity and
# invertibility constraints on the parameters are switched off.
single_model = SARIMAX(
    single_train,
    trend='c',
    order=(1, 1, 0),
    seasonal_order=(2, 0, 0, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
single_fit = single_model.fit()

# One-step-ahead forecast, i.e. the step right after the training data.
single_predict = single_fit.forecast(steps=1)
single_predict
# Observed series next to the single forecast value for plotting.
print_DF = pd.DataFrame(malaria_cases_list, columns=['Malaria_cases'])

# Build the 'Predicted' column up front: NaN everywhere except the last row,
# which receives the one-step forecast. This avoids chained assignment
# (df[col][-1:] = ...), which is not guaranteed to write through to the frame,
# and uses np.nan because the np.NaN alias no longer exists in NumPy 2.
predicted_column = np.full(len(print_DF), np.nan)
predicted_column[-1:] = single_predict
print_DF['Predicted'] = predicted_column
print_DF.index = date

fig = make_subplots()
fig.add_trace(go.Scatter(x=print_DF.index, y=print_DF['Malaria_cases'], name='Given Malaria Disease'))
fig.add_trace(go.Scatter(x=print_DF.index, y=print_DF['Predicted'], name='Predicted Values', mode='markers'))
# print_DF.to_csv('results/predictingCheck.csv',header=True,index=True)
# Fit the same model specification again and forecast 12 steps ahead.
multiple_fit = single_model.fit()
multiple_predict = multiple_fit.forecast(steps=12)
multiple_predict

# Month-end axes: 180 months from January 2005 for the observed series and
# 12 months from December 2019 for the forecast.
observed_months = pd.date_range('01/01/2005', freq='M', periods=180)
forecast_months = pd.date_range('12/01/2019', freq='M', periods=12)

fig = make_subplots()
fig.add_trace(go.Scatter(x=observed_months, y=malaria_cases_list, name='Given Malaria Disease'))
fig.add_trace(go.Scatter(x=forecast_months, y=multiple_predict, name='Predicted Values'))
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Both arguments are converted with ``np.array`` and compared element-wise,
    so they must have matching (or broadcastable) shapes. Each error is taken
    relative to the corresponding ``y_true`` value; a zero in ``y_true``
    therefore produces an infinite or NaN result.
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100
# Work on a copy so that overriding the last value does not silently change
# malaria_cases_list as well (a plain assignment only creates an alias).
malaria_cases_list_final = list(malaria_cases_list)
malaria_cases_list_final[-1] = 145

# Reload the feature table and replace the column at position 9 with the last
# 120 values of the adjusted series.
# NOTE(review): position 9 is presumably the 'Malaria_cases' column, and the
# table presumably has 120 rows -- confirm against the CSV.
finalcsv = pd.read_csv('FinalMalaria.csv', index_col='DATE(IST)', parse_dates=True)
finalcsv.iloc[:, 9] = malaria_cases_list_final[-120:]
finalcsv.head()

# Pairwise correlation of all columns, drawn as an annotated heatmap.
finalcsv_corr = finalcsv.corr()
plt.figure(figsize=(14, 14))
sns.heatmap(finalcsv_corr, annot=True)
# First difference of the column at position 9; the first element is NaN, so
# every diagnostic below skips it with [1:].
differenced_malaria = finalcsv.iloc[:, 9].diff()
differenced_malaria.iplot()

stationarity_or_not_ad_fuller(differenced_malaria[1:])

# Partial autocorrelation (48 lags) and autocorrelation (64 lags, computed
# without FFT) of the differenced series, each shown as a bar chart.
malaria_pacf = pacf(differenced_malaria[1:], nlags=48)
px.bar(y=malaria_pacf)
malaria_acf = acf(differenced_malaria[1:], nlags=64, fft=False)
px.bar(y=malaria_acf)

# Automatic order search with seasonal period 12 and no trend term, starting
# every order at zero.
auto_arima(
    differenced_malaria[1:],
    start_p=0,
    start_q=0,
    start_P=0,
    start_Q=0,
    trend='n',
    m=12,
    alpha=0.1,
)
# Walk-forward validation on the raw case counts: drop the first row, train on
# the first 90% and predict the remaining points one step at a time.
finalcsv_malaria_final = finalcsv[1:]
finalcsv_malaria_final.columns
X = finalcsv_malaria_final['Malaria_cases']
sarima_len = int(len(finalcsv_malaria_final) * 0.90)
sarima_len
sarima_train, sarima_test = X[0:sarima_len], X[sarima_len:len(X)]

sarima_pred = np.empty(len(sarima_test))
sarima_history = list(sarima_train)
# Iterate over the held-out values directly instead of sarima_test[i]: integer
# keys on a non-integer-indexed Series only work through a deprecated
# positional fallback.
for i, actual in enumerate(sarima_test):
    # Refit on all data seen so far, then forecast the next step.
    sarima_model = SARIMAX(
        sarima_history,
        trend='n',
        order=(1, 1, 0),
        seasonal_order=(1, 0, 1, 12),
        enforce_invertibility=False,
        enforce_stationarity=False,
    )
    sarima_result = sarima_model.fit()
    sarima_output = sarima_result.forecast()
    sarima_pred[i] = sarima_output[0]
    # Reveal the true value before predicting the following step.
    sarima_history.append(actual)
    # Tab-separated output (the original '/t' was a typo for '\t').
    print('predicted={0}\texpected={1}'.format(sarima_output[0], actual))
sarima_pred
# Held-out values against the one-step-ahead predictions.
fig = make_subplots()
fig.add_traces([
    go.Scatter(y=sarima_test, name='Given Malaria Disease'),
    go.Scatter(y=sarima_pred, name='Predicted Values'),
])
# Summary of the most recently fitted model (the last walk-forward refit).
sarima_result.summary()
# Load the per-disease table and keep the non-missing malaria values.
diseaseList = pd.read_csv('results/Disease_List.csv', index_col='Date')
malaria_list = diseaseList['Malaria'].dropna()

# Override the final observation by position. The Series is indexed by the
# 'Date' labels, so a bare integer key (malaria_list[-1]) only worked through
# a deprecated positional fallback; .iloc is explicit.
malaria_list.iloc[-1] = 145

# Alternative specification tried earlier:
# single_model = SARIMAX(malaria_list, trend='n', order=(1,1,0), seasonal_order=(1,0,1,12),enforce_stationarity=False,
# enforce_invertibility=False,freq='M')
single_model = SARIMAX(
    malaria_list,
    trend='c',
    order=(1, 1, 0),
    seasonal_order=(2, 0, 0, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
malaria_model = single_model.fit()

# Forecast the four steps following the end of the series.
predict = malaria_model.forecast(steps=4)
predict
# 183-slot buffer: the first 180 slots take the existing series and the last
# three take hand-entered values.
# NOTE(review): 45, 38 and 75 are presumably the three months observed after
# the series ends -- confirm the source of these numbers.
malaria_list_print = np.empty(183)
malaria_list_print[:180] = malaria_list
malaria_list_print[180:] = [45, 38, 75]
len(malaria_list_print)

# Last 15 values on a month-end axis starting 2019-01-31, against the
# forecast plotted on its own index.
recent_months = pd.date_range('2019-01-31', freq='m', periods=15)
fig = make_subplots()
fig.add_trace(go.Scatter(x=recent_months, y=malaria_list_print[-15:], name='Given Malaria Disease'))
fig.add_trace(go.Scatter(x=predict.index, y=predict, name='Predicted Values'))
malaria_model.summary()
# Alternative specification tried earlier:
# single_model = SARIMAX(malaria_list_print, trend='n', order=(1,1,0), seasonal_order=(1,0,1,12),enforce_stationarity=False,
# enforce_invertibility=False)

# Refit the same specification on the extended 183-value array and forecast
# the next four steps.
single_model = SARIMAX(
    malaria_list_print,
    trend='c',
    order=(1, 1, 0),
    seasonal_order=(2, 0, 0, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
malaria_model = single_model.fit()
predict = malaria_model.forecast(steps=4)
predict

# The input is a plain array, so explicit month-end axes are supplied: 15
# months from 2019-01-31 for the observations and 4 months from 2020-04-30
# for the forecast.
observed_axis = pd.date_range('2019-01-31', freq='m', periods=15)
forecast_axis = pd.date_range('2020-04-30', freq='m', periods=4)
fig = make_subplots()
fig.add_trace(go.Scatter(x=observed_axis, y=malaria_list_print[-15:], name='Given Malaria Disease'))
fig.add_trace(go.Scatter(x=forecast_axis, y=predict, name='Predicted Values'))
def _lagged_correlation_heatmap(frame, column, max_lag=11, target='Malaria_cases'):
    """Plot how ``column`` and its lagged copies correlate with ``target``.

    Builds a frame holding ``frame[column]``, one copy shifted by each lag in
    ``1..max_lag`` (named ``shifted_<lag>``) and ``frame[target]`` under the
    name ``'Malaria'``, then draws the correlation matrix as an annotated
    14x14 inch heatmap.

    Returns the tuple ``(lagged_frame, correlation_matrix)``.
    """
    lagged = frame[[column]].copy()
    for lag in range(1, max_lag + 1):
        # Plain string labels: the previous df["shifted_", i] form created
        # tuple column names, which render poorly on the heatmap axes.
        lagged['shifted_{0}'.format(lag)] = frame[column].shift(lag)
    lagged['Malaria'] = frame[target]
    correlation = lagged.corr()
    plt.figure(figsize=(14, 14))
    sns.heatmap(correlation, annot=True)
    return lagged, correlation


# One heatmap per weather variable, each covering lags 1 through 11.
temperature_diff_df, temperature_diff_df_corr = _lagged_correlation_heatmap(finalcsv, 'AIR_TEMP(°C)')
rainfall_diff_df, rainfall_diff_df_corr = _lagged_correlation_heatmap(finalcsv, 'RAIN_FALL(mm)')
min_temperature_diff_df, min_temperature_diff_df_corr = _lagged_correlation_heatmap(finalcsv, 'Min_temp')
pressure_diff_df, pressure_diff_df_corr = _lagged_correlation_heatmap(finalcsv, 'ATMO_PRESSURE(hpa)')

# Redraw the correlation heatmap of the full feature table for comparison.
plt.figure(figsize=(14, 14))
sns.heatmap(finalcsv_corr, annot=True)
# Weather variables go on the secondary y-axis; the malaria series stays on
# the primary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
for weather_column in (
    'AIR_TEMP(°C)',
    'HUMIDITY(%)',
    'ATMO_PRESSURE(hpa)',
    'WIND_SPEED(m/s)',
    'RAIN_FALL(mm)',
):
    fig.add_trace(
        go.Scatter(x=finalcsv.index, y=finalcsv[weather_column], name=weather_column),
        secondary_y=True,
    )
fig.add_trace(go.Scatter(x=finalcsv.index, y=finalcsv['Malaria_cases'], name='Malaria'))
# Work on a copy of the feature table and remove the columns that are not
# used further.
malaria_df = finalcsv.copy()
malaria_df.columns
dropped_columns = [
    'ALTITUDE(m)', 'WIND_SPEED(m/s)', 'WIND_DIRECTION(deg)', 'Month',
    'Max_temp', 'Max_humidity', 'Max_pressure', 'Min_pressure',
]
malaria_df = malaria_df.drop(columns=dropped_columns)

# Add a one-step lagged copy of four weather variables.
for lagged_name, source_name in (
    ('Temperature_shifted_1', 'AIR_TEMP(°C)'),
    ('Rainfall_shifted_1', 'RAIN_FALL(mm)'),
    ('min_temp_shifted_1', 'Min_temp'),
    ('pressure_shifted_1', 'ATMO_PRESSURE(hpa)'),
):
    malaria_df[lagged_name] = malaria_df[source_name].shift(1)

# Drop rows with missing values (at least the first row, which has no lagged
# value), then show the correlation matrix of what remains.
malaria_df_final = malaria_df.dropna()
malaria_df_final_corr = malaria_df_final.corr()
plt.figure(figsize=(12, 12))
sns.heatmap(malaria_df_final_corr, annot=True)
malaria_df_final.columns
# Hand-entered replacement values, addressed by row offset from the end and
# by column position 5.
# NOTE(review): position 5 is presumably 'Malaria_cases' after the column drop
# above -- confirm, and record where these replacement values come from.
for row_from_end, replacement in ((-20, 406), (-19, 435), (-6, 502), (-5, 650)):
    malaria_df.iloc[row_from_end, 5] = replacement
# Natural log of the case counts, then its first difference.
logged_malaria = np.log(malaria_df['Malaria_cases'])
px.line(y=logged_malaria)
logged_dif = logged_malaria.diff()
logged_dif[10:].plot()
px.line(y=logged_dif)

# Stationarity check on the raw counts and on the log-differenced series
# (skipping the leading NaN produced by diff()).
stationarity_or_not_ad_fuller(finalcsv['Malaria_cases'])
stationarity_or_not_ad_fuller(logged_dif[1:])

# Automatic order search on the log-differenced series.
# NOTE(review): alpha=0.9 here versus alpha=0.1 in the earlier search, and
# d=1 on an already differenced series -- confirm both are intentional.
auto_arima(
    logged_dif[1:],
    start_p=0,
    start_q=0,
    start_P=0,
    start_Q=0,
    trend='n',
    m=12,
    alpha=0.9,
    error_action='ignore',
    d=1,
)

# Autocorrelation (64 lags, no FFT) and partial autocorrelation (36 lags) of
# the log-differenced series, each shown as a bar chart.
malaria_acf = acf(logged_dif[1:], nlags=64, fft=False)
px.bar(y=malaria_acf)
malaria_pacf = pacf(logged_dif[1:], nlags=36)
px.bar(y=malaria_pacf)
# Walk-forward validation on the log-differenced series: train on the first
# 80% and predict the remaining points one step at a time.
X = logged_dif[1:]
sarima_len = int(len(X) * 0.8)
sarima_train, sarima_test = X[0:sarima_len], X[sarima_len:len(X)]

# (A stray reference to the undefined name pred_temp used to sit here and
# raised NameError; it has been removed.)
sarima_pred = []
sarima_history = list(sarima_train)
# Iterate over the held-out values directly instead of sarima_test[i]: integer
# keys on a non-integer-indexed Series only work through a deprecated
# positional fallback.
for actual in sarima_test:
    # Refit on all data seen so far, then forecast the next step.
    sarima_model = SARIMAX(
        sarima_history,
        trend='n',
        order=(1, 1, 1),
        seasonal_order=(1, 1, 3, 12),
        enforce_invertibility=False,
        enforce_stationarity=False,
    )
    sarima_result = sarima_model.fit()
    sarima_output = sarima_result.forecast()
    sarima_pred.append(sarima_output[0])
    # Reveal the true value before predicting the following step.
    sarima_history.append(actual)
    # Tab-separated output (the original '/t' was a typo for '\t').
    print('predicted={0}\texpected={1}'.format(sarima_output[0], actual))
# sarima_pred = []
# sarima_history = [x for x in sarima_train]
# for i in range(len(sarima_test)):
# sarima_model =SARIMAX(sarima_history, trend='c', order=(2,0,0), seasonal_order=(1,0,1,12),enforce_invertibility=False,
# enforce_stationarity=False)
# sarima_result = sarima_model.fit()
# sarima_output = sarima_result.forecast()
# sarima_pred.append(sarima_output[0])
# sarima_history.append(sarima_test[i])
# print('predicted={0}/texpected={1}'.format(sarima_output[0],sarima_test[i]))
# Held-out values against the one-step-ahead predictions.
fig = make_subplots()
fig.add_traces([
    go.Scatter(y=sarima_test, name='Given Malaria Disease'),
    go.Scatter(y=sarima_pred, name='Predicted Values'),
])
# Undo the log-differencing: add each predicted / actual log-difference to the
# log level of the preceding observation, then exponentiate back to counts.
logged_malaria[-13:]
len(sarima_pred)

# sarima_test is the tail of logged_malaria.diff(), so the level preceding its
# first element sits len(sarima_test) + 1 positions from the end. Deriving the
# offset replaces the hard-coded -25, and .iloc replaces integer keys on a
# date-indexed Series, which relied on a deprecated positional fallback.
test_count = len(sarima_test)
first_previous = len(logged_malaria) - test_count - 1
previous_log_level = logged_malaria.iloc[first_previous:first_previous + test_count].to_numpy()

real_diff_pred = previous_log_level + np.asarray(sarima_pred, dtype=float)
real_diff_test = previous_log_level + np.asarray(sarima_test, dtype=float)

malaria_pred_temp = np.exp(real_diff_pred)
malaria_test_temp = np.exp(real_diff_test)

# NOTE(review): the printed figure is the R^2 score times 100, rounded up; it
# is labelled "Accuracy" but is not a classification accuracy.
print('Accuracy is {0}%'.format(math.ceil(r2_score(malaria_test_temp, malaria_pred_temp) * 100)))
# Back-transformed actual counts against back-transformed predictions.
fig = make_subplots()
fig.add_traces([
    go.Scatter(y=malaria_test_temp, name='Actual Malaria Cases', mode='lines+markers'),
    go.Scatter(y=malaria_pred_temp, name='Predicted Values'),
])

# Percentage error on the original scale, followed by inspection outputs for
# the last fitted model and the log-differenced series.
mean_absolute_percentage_error(malaria_test_temp, malaria_pred_temp)
sarima_result.fittedvalues
logged_dif
sarima_result.summary()
# lengh = len(malaria_df['Malaria_cases']) +1
# temp1= np.empty(lengh)
# temp1[:-1] = malaria_df['Malaria_cases']
# temp1[-1] = 45
# len(temp1)
# logged = np.array()
# forecast_malaria
# tem_1 = logged_malaria[-1] + forecast_malaria[0]
# temp_1 = np.exp(tem_1)
# temp_1
# tem_2 = tem_1 + forecast_malaria[1]
# temp_2 = np.exp(tem_2)
# temp_2
# tem_3 = tem_2 + forecast_malaria[2]
# temp_3 = np.exp(tem_3)
# temp_3
# malaria_df['Malaria_cases']
# lengh = len(malaria_df['Malaria_cases']) +1
# temp1= np.empty(lengh)
# temp1[:-1] = malaria_df['Malaria_cases']
# temp1[-1] = 45
# # temp1[-1] = 38
# len(temp1)
# logged_df = pd.DataFrame(temp1,columns=['malaria_temp'])
# logged_df.index = pd.date_range('2010-01-31',freq='M',periods=121)
# logged_df_log = np.log(logged_df['malaria_temp'])
# logged_df_log_diff = logged_df_log.diff()
# logged_final_jan = logged_df_log_diff.dropna()
# single_model = SARIMAX(logged_final_jan, trend='n', order=(1,1,1), seasonal_order=(1,1,3,12),enforce_stationarity=False,
# enforce_invertibility=False)
# malaria_model = single_model.fit()
# forecast_malaria = malaria_model.forecast(steps=4)
# forecast_malaria
# logged_df_log[-2:]
# logged_final_jan[-2:]
# tem_1 = logged_df_log[-1] + forecast_malaria[0]
# temp_1 = np.exp(tem_1)
# temp_1
# tem_2 = tem_1 + forecast_malaria[1]
# temp_2 = np.exp(tem_2)
# temp_2
# tem_3 = tem_2 + forecast_malaria[2]
# temp_3 = np.exp(tem_3)
# temp_3
# len(logged_dif)
# len(finalcsv['Malaria_cases'])
# fig = make_subplots(rows=1, cols=2)
# fig.add_trace(
# go.Scatter(y=logged_dif[12:]),
# row=1, col=2
# )
# fig.add_trace(
# go.Scatter(y=finalcsv.iloc[12:,9]),
# row=1, col=1
# )
# fig.update_layout(height=400, width=800, title_text=" Before transformation After transformation")
# plt.figure(figsize=(10,8))
# auto_corr = autocorrelation_plot(logged_dif[1:])
# from statsmodels.graphics.tsaplots import plot_pacf
# plot_pacf(logged_dif[1:], lags=48)
# malaria_list[-12:]
# finalcsv.iloc[-12:,9]